import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
Load commit data.
# Read the commit log, indexed by commit hash, asking pandas to parse the
# 'datetime' column as dates.
all_commits = pd.read_csv(
    'output/all_commits.csv',
    index_col='commit',
    parse_dates=['datetime'],
)
all_commits
| author | datetime | message | jira | ||
|---|---|---|---|---|---|
| commit | |||||
| 5ad435d39e39558b98af3af5aca291eef22651f0 | tallison | tallison@apache.org | 2021-01-12 15:10:23-05:00 | TIKA-3267 change boolean getXYZ to boolean isXYZ | TIKA-3267 |
| e8b4305ce80df59e45352e73ceb972f4534c984c | tallison | tallison@apache.org | 2021-01-11 17:05:01-05:00 | TIKA-3269 update artifacts for 2.0.0 | TIKA-3269 |
| b0fb00ef6a11db121c56436b9542a4cf248fef8b | Tim Allison | tallison@apache.org | 2021-01-11 15:48:10-05:00 | TIKA-3266 (#396) | TIKA-3266 |
| 566c5962101db0be4ff56c6ff3740c05701a3c0e | THausherr | tilman@snafu.de | 2021-01-10 14:21:20+01:00 | TIKA-3244: update zstd | TIKA-3244 |
| d732cc17ea555572d1b1fe9e041931fb36567cdf | tallison | tallison@apache.org | 2021-01-07 16:45:31-05:00 | TIKA-3268 -- throw exception if excluded parse... | TIKA-3268 |
| ... | ... | ... | ... | ... | ... |
| c5417bed9a7f84acb913f0197e241ecb0d1205b4 | Jukka Zitting | jukka@apache.org | 2007-03-31 12:40:53+00:00 | TIKA-2: Basic web site based on Maven 2. | TIKA-2 |
| 6e750bb72e8be60c27bcf6bf6dd3a05742c812fc | Jukka Zitting | jukka@apache.org | 2007-03-31 10:35:09+00:00 | TIKA-4: Ignore Eclipse project files. | TIKA-4 |
| 3794e9a8a99172f9666befbba686af29276200ea | Jukka Zitting | jukka@apache.org | 2007-03-31 10:31:15+00:00 | TIKA-4: Basic Maven 2 POM and source tree for ... | TIKA-4 |
| f10cd4207c54fc45e4b244b5e538f9943c697c5b | Jukka Zitting | jukka@apache.org | 2007-03-31 07:40:09+00:00 | TIKA-1: Standard README, NOTICE, and LICENSE f... | TIKA-1 |
| 19ab44f873feae6d8acedfe662d1839e4aece854 | Jukka Zitting | jukka@apache.org | 2007-03-31 07:22:55+00:00 | TIKA-1: Standard {trunk,branches,tags} setup | TIKA-1 |
4984 rows × 5 columns
Drop the columns we don't need.
# Keep every row, but only the author, timestamp and Jira key columns.
all_commits = all_commits.loc[:, ['author', 'datetime', 'jira']]
Make sure the values in the datetime column were indeed parsed as datetimes.
# Spot-check the first value of the 'datetime' column to see which type the
# parser produced.
all_commits['datetime'].values[0]
datetime.datetime(2021, 1, 12, 15, 10, 23, tzinfo=tzoffset(None, -18000))
Load the aggregated report file, all_reports.csv. It contains the Designite report data for every single Tika commit.
# Read the report rows keyed by (package, commit); 'package' is loaded as a
# categorical.
all_reports = pd.read_csv(
    'output/all_reports.csv',
    index_col=['package', 'commit'],
    dtype={'package': 'category'},
)
# Rename the raw 'metric' column to '# classes'.
column_names = {'metric': '# classes'}
all_reports = all_reports.rename(columns=column_names)
all_reports
| repo | smell | cause | # classes | ||
|---|---|---|---|---|---|
| package | commit | ||||
| org.apache.tika.example | 49bb4691393c016d8d65e6b11febca9e56feedef | tika-cpu_21 | God Component | MANY_CLASSES | 49 |
| org.apache.tika.batch | 49bb4691393c016d8d65e6b11febca9e56feedef | tika-cpu_21 | God Component | MANY_CLASSES | 31 |
| org.apache.tika.detect | 49bb4691393c016d8d65e6b11febca9e56feedef | tika-cpu_21 | God Component | MANY_CLASSES | 31 |
| org.apache.tika.parser | 49bb4691393c016d8d65e6b11febca9e56feedef | tika-cpu_21 | God Component | MANY_CLASSES | 37 |
| org.apache.tika.mime | 49bb4691393c016d8d65e6b11febca9e56feedef | tika-cpu_21 | God Component | MANY_CLASSES | 31 |
| ... | ... | ... | ... | ... | ... |
| org.apache.tika.sax | 77d57d5506cfacf4e75d09e214643deda5a52047 | tika-cpu_17 | God Component | MANY_CLASSES | 35 |
| org.apache.tika.parser.txt | 77d57d5506cfacf4e75d09e214643deda5a52047 | tika-cpu_17 | God Component | MANY_CLASSES | 74 |
| org.apache.tika.sax | edb6775bf356eaaf656730589cc3340a15b602ea | tika-cpu_21 | God Component | MANY_CLASSES | 33 |
| org.apache.tika.parser.txt | edb6775bf356eaaf656730589cc3340a15b602ea | tika-cpu_21 | God Component | MANY_CLASSES | 71 |
| org.apache.tika.parser.microsoft | edb6775bf356eaaf656730589cc3340a15b602ea | tika-cpu_21 | God Component | MANY_CLASSES | 31 |
28077 rows × 4 columns
Drop the columns we don't need.
# Only the class count is needed from here on (kept as a one-column frame).
all_reports = all_reports.loc[:, ['# classes']]
Add commits to report data, combining them into one big dataset, gcdata.
# Left join: keep every report row and attach the commit's author, datetime
# and Jira key.
gcdata = all_reports.join(all_commits, how='left')
General statistics on lifetime.
# Compute date data
dt = gcdata.groupby('package')['datetime']


def normalize_date(date):
    """Return the given Series of timestamps as Europe/Amsterdam datetimes."""
    # The commit timestamps carry different UTC offsets, so convert through
    # UTC first to obtain a single timezone-aware datetime dtype.
    return pd.to_datetime(date, utc=True).dt.tz_convert('Europe/Amsterdam')


dtmin = normalize_date(dt.min())
dtmax = normalize_date(dt.max())
became_gc = dtmin.dt.strftime('%Y-%m-%d')
last_gc = dtmax.dt.strftime('%Y-%m-%d')
gc_commits = dt.count()
gc_days = (dtmax - dtmin).dt.days

# Compute author data
authors = gcdata.groupby('package')['author'].unique()
n_authors = authors.apply(len)  # number of distinct author names per package

# DataFrame
# Build the frame column-by-column from a dict: stacking the Series as rows
# and transposing would coerce every column (including the counts) to object
# dtype.
stats = pd.DataFrame({
    'Became GC at': became_gc,
    'Last seen as GC': last_gc,
    '# GC commits': gc_commits,
    '# GC days': gc_days,
    '# authors': n_authors,
}).reset_index()
stats
| package | Became GC at | Last seen as GC | # GC commits | # GC days | # authors | |
|---|---|---|---|---|---|---|
| 0 | org.apache.tika.batch | 2015-06-28 | 2020-12-14 | 2352 | 1996 | 114 |
| 1 | org.apache.tika.detect | 2017-01-19 | 2020-12-14 | 1579 | 1424 | 82 |
| 2 | org.apache.tika.example | 2015-05-04 | 2020-12-14 | 2433 | 2050 | 115 |
| 3 | org.apache.tika.fork | 2018-05-31 | 2020-12-14 | 738 | 927 | 52 |
| 4 | org.apache.tika.metadata | 2016-09-26 | 2020-12-14 | 1743 | 1539 | 88 |
| 5 | org.apache.tika.mime | 2015-05-02 | 2020-12-14 | 2443 | 2053 | 115 |
| 6 | org.apache.tika.parser | 2015-02-21 | 2020-12-14 | 2429 | 2123 | 116 |
| 7 | org.apache.tika.parser.microsoft | 2011-11-25 | 2020-12-14 | 3052 | 3306 | 121 |
| 8 | org.apache.tika.parser.microsoft.chm | 2020-08-21 | 2020-12-14 | 155 | 114 | 14 |
| 9 | org.apache.tika.parser.microsoft.onenote | 2019-12-16 | 2020-12-14 | 305 | 363 | 26 |
| 10 | org.apache.tika.parser.microsoft.ooxml | 2017-03-23 | 2020-12-14 | 1454 | 1361 | 80 |
| 11 | org.apache.tika.parser.txt | 2009-05-22 | 2020-12-14 | 4539 | 4223 | 123 |
| 12 | org.apache.tika.sax | 2011-09-25 | 2020-12-14 | 3691 | 3367 | 121 |
| 13 | org.apache.tika.server | 2014-05-07 | 2020-12-14 | 1078 | 2412 | 47 |
| 14 | org.apache.tika.utils | 2020-10-31 | 2020-12-14 | 86 | 44 | 8 |
# Mean lifetime in days, expressed in (365-day) years.
avg_gc_years = gc_days.mean() / 365
'Average GC lifetime: ' + str(avg_gc_years) + ' years'
'Average GC lifetime: 4.986666666666667 years'
Average over the above stats
# Take the mean of each numeric statistic.
mean_columns = ['# GC commits', '# GC days', '# authors']
stats.agg(dict.fromkeys(mean_columns, 'mean'))
# GC commits 1871.800000 # GC days 1820.133333 # authors 81.466667 dtype: float64
Total number of God Components
# Count the report rows per commit; each row is one package flagged at that
# commit, so the '# classes' count is the number of God Components.
commit_groups = gcdata.groupby(['commit', 'datetime'])
total_gcs = commit_groups.count().reset_index()
sns.lineplot(x='datetime', y='# classes', data=total_gcs)
plt.ylabel('# god components')
Text(0, 0.5, '# god components')
Number of classes per God Component
fig, ax = plt.subplots(figsize=(10, 4.2))
# One line per package, with rows sorted by package before plotting.
gc_by_package = gcdata.sort_values('package')
g = sns.lineplot(
    x='datetime',
    y='# classes',
    hue='package',
    data=gc_by_package,
    ax=ax,
)
# Place the legend outside the axes, to the right of the plot.
plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)
<matplotlib.legend.Legend at 0x11dfeed60>
# classes chronological difference (delta)
# For every package, align its report rows with the complete commit list
# (right merge), so commits where the package was not reported get NaN for
# '# classes'.
gcdelta = all_reports.groupby('package')\
    .apply(lambda df: df.merge(all_commits, on='commit', how='right'))

# Rows follow the order of all_commits within each package, so periods=-1
# compares a commit with the row listed after it. The diff is taken per
# package; a diff over the whole stacked frame would also compare the last
# row of one package with the first row of the next.
gcdelta['# classes diff'] = gcdelta.groupby(level='package')['# classes']\
    .diff(periods=-1).fillna(0).astype(int)

# NaN > 0 is False, so commits without a report row count as "not a GC".
gcdelta['is gc'] = gcdelta['# classes'] > 0
# Flag rows whose GC status differs from the following row of the same
# package; the last row of each package has no successor and is never flagged.
next_is_gc = gcdelta.groupby(level='package')['is gc'].shift(-1)
gcdelta['is gc diff'] = next_is_gc.notna() & (gcdelta['is gc'] != next_is_gc)

# Split the signed diff into its positive and negative parts.
gcdelta['# classes added'] = gcdelta['# classes diff'].clip(lower=0)
gcdelta['# classes removed'] = gcdelta['# classes diff'].clip(upper=0)

print('gcdelta has {} total rows'.format(len(gcdelta)))
gcdelta[gcdelta['# classes diff'] > 0]
gcdelta has 74760 total rows
| # classes | author | datetime | jira | # classes diff | is gc | is gc diff | # classes added | # classes removed | ||
|---|---|---|---|---|---|---|---|---|---|---|
| package | commit | |||||||||
| org.apache.tika.batch | 30c3d8104a51f015416382995435a4785059f07c | 32.0 | TALLISON | 2018-11-13 14:08:26-05:00 | TIKA-2778 | 1 | True | False | 1 | 0 |
| org.apache.tika.detect | 8b82b4c942c82f9cc2eb393e669227606b5f15fc | 37.0 | tallison | 2020-12-04 13:16:28-05:00 | TIKA-3218 | 1 | True | False | 1 | 0 |
| a43784b19f6b0955478dded71521b0491d21c90b | 36.0 | tallison | 2020-12-02 11:58:44-05:00 | TIKA-3241 | 1 | True | False | 1 | 0 | |
| ee8caf69456bd52d278283792dc1b9c56477c243 | 36.0 | tallison | 2020-10-29 10:57:03-04:00 | TIKA-3215 | 4 | True | False | 4 | 0 | |
| 70ca280f11fe4127df290b8027c6bc1d5180271f | 32.0 | lfcnassif | 2017-09-08 12:36:48-03:00 | TIKA-2460 | 1 | True | False | 1 | 0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| org.apache.tika.server | f776fc07aecdf4e97343cba7ee7d35d1cd9743df | 39.0 | Tim Allison | 2014-12-19 03:12:38+00:00 | TIKA-1497 | 1 | True | False | 1 | 0 |
| 57db5ef21c95f49aa19b9da69a69d27a2a1da5ad | 38.0 | Tim Allison | 2014-12-18 19:50:52+00:00 | TIKA-1498 | 4 | True | False | 4 | 0 | |
| 95051f2a352f3e7ee83acf203d75b803a26c3f4f | 34.0 | Sergey Beryozkin | 2014-08-06 08:06:52+00:00 | TIKA-1371 | 1 | True | False | 1 | 0 | |
| 583af266a28f6494c29fcd62fa78fcc32199cc2e | 33.0 | Chris Mattmann | 2014-06-14 18:52:44+00:00 | TIKA-1336 | 2 | True | False | 2 | 0 | |
| org.apache.tika.utils | 8b82b4c942c82f9cc2eb393e669227606b5f15fc | 32.0 | tallison | 2020-12-04 13:16:28-05:00 | TIKA-3218 | 1 | True | False | 1 | 0 |
144 rows × 9 columns
import matplotlib.dates as md
fig, ax = plt.subplots(figsize=(10, 4.2))

gc_rows = gcdelta[gcdelta['is gc']]
gc_transitions = gcdelta[gcdelta['is gc diff']]

# Draw one `_` marker per GC commit so each package reads as a continuous line.
sns.scatterplot(
    data=gc_rows,
    x='datetime', y='package', hue='package',
    marker='_', edgecolor=None, legend=False, ax=ax,
)
# Mark the commits where a package's GC status changes (start/end of a
# GC lifetime); the marker shape encodes the 'is gc' value of that row.
sns.scatterplot(
    data=gc_transitions,
    x='datetime', y='package', hue='package',
    style='is gc', markers={False: '.', True: '>'},
    edgecolor=None, alpha=0.75, ax=ax,
)

# One tick per year on the x axis; the y axis is hidden.
ax.xaxis.set_major_locator(md.YearLocator())
ax.xaxis.set_major_formatter(md.DateFormatter('%Y'))
ax.axes.yaxis.set_visible(False)
plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)
<matplotlib.legend.Legend at 0x1255b6f70>
Save reduced versions of the dataframe: one with only the rows where the absolute `# classes` difference is > 3, and one with only the rows where the `is gc` status changes.
sort_keys = ['package', 'datetime']

# Rows where the class count jumped by more than 3 in either direction.
nclasses_columns = ['# classes', '# classes diff', '# classes added',
                    '# classes removed', 'author', 'datetime', 'jira']
big_changes = gcdelta[gcdelta['# classes diff'].abs() > 3]
big_changes = big_changes.sort_values(sort_keys, ascending=True)
big_changes.to_csv('output/diffs_nclasses.csv',
                   columns=nclasses_columns,
                   float_format='%.f')

# Rows where the GC status changes.
isgc_columns = ['is gc', '# classes', '# classes diff', 'author', 'datetime', 'jira']
status_changes = gcdelta[gcdelta['is gc diff']]
status_changes = status_changes.sort_values(sort_keys, ascending=True)
status_changes.to_csv('output/diffs_isgc.csv',
                      columns=isgc_columns,
                      float_format='%.f')
Load up data on Lines Of Code for every God Component at the state of every commit.
# Lines-of-code data, keyed by (package, commit).
all_locs = pd.read_csv(
    'output/all_locs.csv',
    index_col=['package', 'commit'],
)
all_locs
| additions | deletions | LOC | change | ||
|---|---|---|---|---|---|
| package | commit | ||||
| org.apache.tika.batch | fe4cd58cced0e15f1848afbc2518ab4a66e6867f | 6638 | 0 | 6638 | 6638 |
| dbc5eb76b7a21587eba83724e36bff1cb29e72c6 | 251 | 227 | 6662 | 24 | |
| 8054dddc156444a0e67a66fcd2dbc9b4bb5db4b7 | 13 | 14 | 6661 | -1 | |
| 4ae33b70378e0ee66b7d9c95d6fd7d51b10cc658 | 28 | 26 | 6663 | 2 | |
| 6a013f550bf7e8cfbed4652ec3785e1627d0f7ea | 353 | 334 | 6682 | 19 | |
| ... | ... | ... | ... | ... | ... |
| org.apache.tika.utils | 813a6eb69853d87ae54fc4bf6183267bb480322d | 2 | 2 | 4969 | 0 |
| 7fd2825613bce5fd741a0126de8d812a316389ba | 2 | 2 | 4969 | 0 | |
| dd85c73094dc87f6f6e208278325be810761f490 | 1 | 1 | 4969 | 0 | |
| 326b7d7abda238ccc181fc2acaf2271e08fe9c1b | 1 | 1 | 4969 | 0 | |
| 7f65d61b4fe1f4c1d4929bbe1c456c68afd44b6c | 1 | 1 | 4969 | 0 |
4505 rows × 4 columns
Add commit datetime.
# Left join: attach the commit's author, datetime and Jira key to each LOC row.
locdata = all_locs.join(all_commits, how='left')

fig, ax = plt.subplots(figsize=(10, 4.2))
g = sns.lineplot(x='datetime', y='LOC', hue='package', data=locdata, ax=ax)
# Log-scaled y axis, with ScalarFormatter tick labels.
g.set_yscale('log')
g.yaxis.set_major_formatter(ticker.ScalarFormatter())
plt.ylabel('Lines Of Code (LOC)')
plt.legend(loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)
<matplotlib.legend.Legend at 0x11e3263a0>
How many developers contributed to God Components? We aim to answer this question in terms of both God Component (1) buildup and (2) refactoring. We do this by considering the # classes added and # classes removed for each developer.
# Magnitude of the class-count change, regardless of direction.
gcdelta['# classes changed'] = gcdelta['# classes diff'].abs()

# Select the numeric columns *before* summing: aggregating the whole frame
# would also try to sum the 'datetime' and 'jira' columns, which newer pandas
# versions refuse to do instead of silently dropping them.
class_columns = ['# classes changed', '# classes added', '# classes removed']
classes_developers = gcdelta[gcdelta['# classes changed'] > 0]\
    .groupby('author')[class_columns]\
    .sum()

# Each author's share of all class changes, in percent.
totalclasses = classes_developers['# classes changed'].sum()
classes_developers['% contributed'] = classes_developers['# classes changed'] / totalclasses * 100
classes_developers.sort_values('% contributed', ascending=False)
| # classes changed | # classes added | # classes removed | % contributed | |
|---|---|---|---|---|
| author | ||||
| tballison | 138 | 67 | -71 | 20.474777 |
| Nick Burch | 85 | 42 | -43 | 12.611276 |
| tallison | 73 | 41 | -32 | 10.830861 |
| Thamme Gowda | 60 | 54 | -6 | 8.902077 |
| Tim Allison | 57 | 54 | -3 | 8.456973 |
| Lewis John McGibbney | 49 | 23 | -26 | 7.270030 |
| TALLISON | 46 | 29 | -17 | 6.824926 |
| Giuseppe Totaro | 30 | 28 | -2 | 4.451039 |
| Chris Mattmann | 23 | 8 | -15 | 3.412463 |
| amensiko | 18 | 11 | -7 | 2.670623 |
| ThejanW | 11 | 0 | -11 | 1.632047 |
| Cameron Rollheiser | 11 | 0 | -11 | 1.632047 |
| lfcnassif | 9 | 2 | -7 | 1.335312 |
| Sergey Beryozkin | 9 | 9 | 0 | 1.335312 |
| Jukka Zitting | 9 | 7 | -2 | 1.335312 |
| Matthew Caruana Galizia | 8 | 0 | -8 | 1.186944 |
| manali | 6 | 2 | -4 | 0.890208 |
| Konstantin Gribov | 5 | 4 | -1 | 0.741840 |
| Tom Barber | 5 | 3 | -2 | 0.741840 |
| Tyler Palsulich | 3 | 3 | 0 | 0.445104 |
| avtar singh | 3 | 3 | 0 | 0.445104 |
| nandan-pc | 3 | 0 | -3 | 0.445104 |
| John Patrick | 3 | 3 | 0 | 0.445104 |
| Bob Paulin | 3 | 0 | -3 | 0.445104 |
| Maxim Valyanskiy | 2 | 2 | 0 | 0.296736 |
| Ray Gauss II | 1 | 1 | 0 | 0.148368 |
| Nicholas DiPiazza | 1 | 1 | 0 | 0.148368 |
| Lee | 1 | 1 | 0 | 0.148368 |
| Kenneth William Krugler | 1 | 1 | 0 | 0.148368 |
| smadha | 1 | 0 | -1 | 0.148368 |
# The five authors with the largest number of changed classes.
top5devs = classes_developers.nlargest(5, '# classes changed').index.values
top5devdata = gcdelta[gcdelta['author'].isin(top5devs)]

# Summarise per author instead of selecting the non-existent column ''.
# Aggregating first also keeps the result small enough to plot, e.g. with
# `.plot.bar(stacked=True)`; a bar plot of the raw rows draws one bar per row.
top5devdata.groupby('author')[['# classes added', '# classes removed']].sum()
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-323-407142a4df54> in <module> 1 top5devs = classes_developers.nlargest(5, '# classes changed').index.values 2 top5devdata = gcdelta[gcdelta['author'].isin(top5devs)] ----> 3 top5devdata.plot.bar(stacked=True) 4 # plt.ylabel('# classes') 5 # sns.catplot(data=top5devdata.reset_index(), x='package', y='# classes diff', hue='author', col='package') ~/Library/Python/3.8/lib/python/site-packages/pandas/plotting/_core.py in bar(self, x, y, **kwargs) 1105 other axis represents a measured value. 1106 """ -> 1107 return self(kind="bar", x=x, y=y, **kwargs) 1108 1109 @Appender( ~/Library/Python/3.8/lib/python/site-packages/pandas/plotting/_core.py in __call__(self, *args, **kwargs) 947 data.columns = label_name 948 --> 949 return plot_backend.plot(data, kind=kind, **kwargs) 950 951 __call__.__doc__ = __doc__ ~/Library/Python/3.8/lib/python/site-packages/pandas/plotting/_matplotlib/__init__.py in plot(data, kind, **kwargs) 59 kwargs["ax"] = getattr(ax, "left_ax", ax) 60 plot_obj = PLOT_CLASSES[kind](data, **kwargs) ---> 61 plot_obj.generate() 62 plot_obj.draw() 63 return plot_obj.result ~/Library/Python/3.8/lib/python/site-packages/pandas/plotting/_matplotlib/core.py in generate(self) 269 self._compute_plot_data() 270 self._setup_subplots() --> 271 self._make_plot() 272 self._add_table() 273 self._make_legend() ~/Library/Python/3.8/lib/python/site-packages/pandas/plotting/_matplotlib/core.py in _make_plot(self) 1426 start = np.where(mask, pos_prior, neg_prior) + self._start_base 1427 w = self.bar_width / 2 -> 1428 rect = self._plot( 1429 ax, 1430 self.ax_pos + w, ~/Library/Python/3.8/lib/python/site-packages/pandas/plotting/_matplotlib/core.py in _plot(cls, ax, x, y, w, start, log, **kwds) 1371 @classmethod 1372 def _plot(cls, ax, x, y, w, start=0, log=False, **kwds): -> 1373 return ax.bar(x, y, w, bottom=start, log=log, **kwds) 1374 
1375 @property ~/Library/Python/3.8/lib/python/site-packages/matplotlib/__init__.py in inner(ax, data, *args, **kwargs) 1445 def inner(ax, *args, data=None, **kwargs): 1446 if data is None: -> 1447 return func(ax, *map(sanitize_sequence, args), **kwargs) 1448 1449 bound = new_sig.bind(ax, *args, **kwargs) ~/Library/Python/3.8/lib/python/site-packages/matplotlib/axes/_axes.py in bar(self, x, height, width, bottom, align, **kwargs) 2492 elif orientation == 'horizontal': 2493 r.sticky_edges.x.append(l) -> 2494 self.add_patch(r) 2495 patches.append(r) 2496 ~/Library/Python/3.8/lib/python/site-packages/matplotlib/axes/_base.py in add_patch(self, p) 2031 if p.get_clip_path() is None: 2032 p.set_clip_path(self.patch) -> 2033 self._update_patch_limits(p) 2034 self.patches.append(p) 2035 p._remove_method = self.patches.remove ~/Library/Python/3.8/lib/python/site-packages/matplotlib/axes/_base.py in _update_patch_limits(self, patch) 2051 vertices = patch.get_path().vertices 2052 if vertices.size > 0: -> 2053 xys = patch.get_patch_transform().transform(vertices) 2054 if patch.get_data_transform() != self.transData: 2055 patch_to_data = (patch.get_data_transform() - ~/Library/Python/3.8/lib/python/site-packages/matplotlib/patches.py in get_patch_transform(self) 790 791 def get_patch_transform(self): --> 792 self._update_patch_transform() 793 return self._rect_transform 794 ~/Library/Python/3.8/lib/python/site-packages/matplotlib/patches.py in _update_patch_transform(self) 772 rot_trans = transforms.Affine2D() 773 rot_trans.rotate_deg_around(x0, y0, self.angle) --> 774 self._rect_transform = transforms.BboxTransformTo(bbox) 775 self._rect_transform += rot_trans 776 ~/Library/Python/3.8/lib/python/site-packages/matplotlib/transforms.py in __init__(self, boxout, **kwargs) 2519 Affine2DBase.__init__(self, **kwargs) 2520 self._boxout = boxout -> 2521 self.set_children(boxout) 2522 self._mtx = None 2523 self._inverted = None 
~/Library/Python/3.8/lib/python/site-packages/matplotlib/transforms.py in set_children(self, *children) 193 # parents are destroyed, references from the children won't 194 # keep them alive. --> 195 for child in children: 196 # Use weak references so this dictionary won't keep obsolete nodes 197 # alive; the callback deletes the dictionary entry. This is a KeyboardInterrupt:
# One row per author in classes_developers, so the row count is the number of developers.
'{} developers added or removed classes to Tika'.format(classes_developers.shape[0])
'30 developers added or removed classes to Tika'
Also, show that only a few developers account for most of the work on the project as a whole. Developers versus lines of code (LOC) added/removed:
# Sum only the per-commit change columns for each author. Selecting them up
# front avoids summing 'LOC' (which was dropped afterwards anyway) and the
# non-numeric 'datetime' / 'jira' columns, which newer pandas versions refuse
# to sum.
loc_columns = ['additions', 'deletions', 'change']
locs_developers = locdata.groupby('author')[loc_columns].sum()

# Each author's share of the total net LOC change, in percent.
totalchange = locs_developers['change'].sum()
locs_developers['% contributed'] = locs_developers['change'] / totalchange * 100
locs_developers.nlargest(15, '% contributed')
| additions | deletions | change | % contributed | |
|---|---|---|---|---|
| author | ||||
| Chris Mattmann | 47236 | 13721 | 33515 | 13.536492 |
| tballison | 48095 | 17166 | 30929 | 12.492023 |
| Jukka Zitting | 77686 | 47854 | 29832 | 12.048952 |
| Nick Burch | 42384 | 12628 | 29756 | 12.018256 |
| Madhav Sharan | 27264 | 3846 | 23418 | 9.458379 |
| Tim Allison | 97122 | 74176 | 22946 | 9.267741 |
| TALLISON | 16716 | 8493 | 8223 | 3.321217 |
| Lewis John McGibbney | 10185 | 2012 | 8173 | 3.301022 |
| tallison | 50551 | 43257 | 7294 | 2.945999 |
| Oleg Tikhonov | 13201 | 6064 | 7137 | 2.882588 |
| Thamme Gowda | 9129 | 2918 | 6211 | 2.508583 |
| Nicholas DiPiazza | 6339 | 248 | 6091 | 2.460116 |
| avtar singh | 4368 | 717 | 3651 | 1.474615 |
| Tom Barber | 3955 | 667 | 3288 | 1.328002 |
| Michael McCandless | 4379 | 1333 | 3046 | 1.230260 |
Add the Tika Jira issue tracker information as a data source.
# Jira issues indexed by issue key; 'issuetype' is loaded as a categorical.
all_issues = pd.read_csv(
    'output/all_issues.csv',
    index_col='jira',
    dtype={'issuetype': 'category'},
)
# Drop columns that are not used in this analysis.
unused_columns = ['id', 'self', 'reporter', 'updated']
all_issues = all_issues.drop(columns=unused_columns)
all_issues
| resolution | priority | assignee | status | creator | issuetype | resolutiondate | created | components | |
|---|---|---|---|---|---|---|---|---|---|
| jira | |||||||||
| TIKA-3256 | Fixed | Minor | tilman | Resolved | tilman | Task | 2020-12-27T13:08:45.000+0000 | 2020-12-27T13:07:15.000+0000 | general |
| TIKA-3255 | NaN | Major | NaN | Open | peterkronenberg | Bug | NaN | 2020-12-22T17:04:55.000+0000 | NaN |
| TIKA-3254 | NaN | Major | NaN | Open | sathia | Bug | NaN | 2020-12-22T13:49:17.000+0000 | NaN |
| TIKA-3253 | NaN | Minor | NaN | Open | tilman | Improvement | NaN | 2020-12-17T08:57:18.000+0000 | tika-eval |
| TIKA-3252 | NaN | Trivial | NaN | Open | tilman | Bug | NaN | 2020-12-17T08:38:45.000+0000 | documentation |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TIKA-5 | Fixed | Major | chrismattmann | Closed | chrismattmann | New Feature | 2007-06-08T22:55:39.000+0000 | 2007-05-31T21:49:14.000+0000 | general |
| TIKA-4 | Fixed | Major | jukkaz | Closed | jukkaz | Task | 2007-03-31T13:48:32.000+0000 | 2007-03-31T06:58:59.000+0000 | NaN |
| TIKA-3 | Fixed | Major | jukkaz | Closed | jukkaz | Task | 2007-03-31T09:33:24.000+0000 | 2007-03-31T06:57:02.000+0000 | NaN |
| TIKA-2 | Fixed | Major | jukkaz | Closed | jukkaz | Task | 2007-03-31T13:49:41.000+0000 | 2007-03-31T06:55:41.000+0000 | documentation |
| TIKA-1 | Fixed | Major | jukkaz | Closed | jukkaz | Task | 2007-03-31T07:43:18.000+0000 | 2007-03-31T06:54:09.000+0000 | general |
3248 rows × 9 columns
Keep only issue key and issue type columns.
# The issue key stays as the index; keep 'issuetype' as the only column.
all_issues = all_issues.loc[:, ['issuetype']]
Generally, how many and what types of issues are in Apache Tika's Jira issue tracker?
# Count the issues per type once, then report the total and the breakdown.
issue_type_counts = all_issues.value_counts()
print(issue_type_counts.sum(), 'issues total.')
issue_type_counts
3248 issues total.
issuetype Bug 1592 Improvement 1056 Task 260 New Feature 219 Sub-task 65 Wish 46 Test 10 dtype: int64
... of which these amounts are involved in God Component commits:
issuedata = gcdata.reset_index().merge(all_issues, on='jira')  # combine with GC data
# Count each issue once, no matter how many GC rows reference it.
unique_issues = issuedata[['jira', 'issuetype']].drop_duplicates()
issuetypes = unique_issues[['issuetype']]
gc_issue_type_counts = issuetypes.value_counts()
print(gc_issue_type_counts.sum(), 'issues total related to GC commits.')
gc_issue_type_counts
1694 issues total related to GC commits.
issuetype Bug 708 Improvement 643 Task 178 New Feature 107 Sub-task 37 Wish 17 Test 4 dtype: int64
... which is this percentage:
# Per issue type: the fraction of all issues that appear in God Component commits.
issuetypes.value_counts().div(all_issues.value_counts())
issuetype Bug 0.444724 Improvement 0.608902 Task 0.684615 New Feature 0.488584 Sub-task 0.569231 Wish 0.369565 Test 0.400000 dtype: float64
We can also check what issue types are represented most in the God Component commits:
# Relative frequency of each issue type among the GC-related issues.
gc_issue_type_share = issuetypes.value_counts(normalize=True)
gc_issue_type_share
issuetype Bug 0.417946 Improvement 0.379575 Task 0.105077 New Feature 0.063164 Sub-task 0.021842 Wish 0.010035 Test 0.002361 dtype: float64
Build a pivot table and show heatmap.
issuedata = gcdata.reset_index().merge(all_issues, on='jira')  # combine with GC data
# Count how often each (package, issue type) pair occurs.
typecounts = issuedata.set_index(['package', 'issuetype']).index.value_counts()
# Turn the counted pairs back into a flat frame with one row per pair.
pair_index = pd.MultiIndex.from_tuples(typecounts.keys(),
                                       names=['package', 'issuetype'])
typecountdf = pd.DataFrame({'count': typecounts.values}, index=pair_index)
typecountdf = typecountdf.reset_index()
typecountdf
| package | issuetype | count | |
|---|---|---|---|
| 0 | org.apache.tika.parser.txt | Improvement | 1252 |
| 1 | org.apache.tika.parser.txt | Bug | 1164 |
| 2 | org.apache.tika.sax | Improvement | 961 |
| 3 | org.apache.tika.sax | Bug | 906 |
| 4 | org.apache.tika.parser.microsoft | Improvement | 721 |
| ... | ... | ... | ... |
| 85 | org.apache.tika.server | Test | 3 |
| 86 | org.apache.tika.parser.microsoft.chm | New Feature | 2 |
| 87 | org.apache.tika.parser | Test | 2 |
| 88 | org.apache.tika.fork | Sub-task | 1 |
| 89 | org.apache.tika.utils | New Feature | 1 |
90 rows × 3 columns
# Reshape to a package x issue type grid of counts.
rectangular = typecountdf.pivot(index='package', columns='issuetype', values='count')

f, ax = plt.subplots()
plt.yticks(rotation=0)
plt.xticks(rotation=45)
# Annotate every cell with its count and label every row.
heatmap_options = dict(annot=True, fmt='.0f', linewidths=.5,
                       cmap="YlGnBu", yticklabels=True)
g = sns.heatmap(rectangular, ax=ax, **heatmap_options)
plt.title('Amount of commits related to issue types per GC')
Text(0.5, 1.0, 'Amount of commits related to issue types per GC')
This time around, only include the commits that actually 'build up' or 'decrease' the size of a God Component, i.e. the commits that add classes to or remove classes from a GC.
# Attach the issue type to every gcdelta row that references a known Jira issue.
gd = gcdelta.reset_index().merge(all_issues, on='jira')
# Select the two class-count columns before summing, so the aggregation never
# touches the non-numeric columns (package, commit, author, datetime, jira).
netto = gd.groupby('issuetype')[['# classes added', '# classes removed']].sum()
netto
| # classes added | # classes removed | |
|---|---|---|
| issuetype | ||
| Bug | 48 | -51 |
| Improvement | 143 | -102 |
| New Feature | 24 | -16 |
| Sub-task | 4 | -4 |
| Task | 56 | -39 |
| Test | 0 | 0 |
| Wish | 0 | 0 |
# Stacked bars: classes added (positive) and removed (negative) per issue type.
netto.plot(kind='bar', stacked=True)
plt.ylabel('# classes')
Text(0, 0.5, '# classes')